library(ggplot2)
library(ggpubr)
library(CDM)
library(boot)
library(tidyverse)
library(dummy)
library(stringi)
library(stringr)
rm(list = ls())
x_pre <- read_csv("data\\OUTPUT.csv")
Parsed with column specification:
cols(
.default = col_character(),
SubjectID = [32mcol_double()[39m,
`Auto Score 1` = [32mcol_double()[39m,
`Auto Score 2` = [32mcol_double()[39m,
`Auto Score 3` = [32mcol_double()[39m,
`Auto Score 4` = [32mcol_double()[39m,
`Auto Score 5` = [32mcol_double()[39m,
`Auto Score 6` = [32mcol_double()[39m,
`Auto Score 7` = [32mcol_double()[39m,
`Auto Score 8` = [32mcol_double()[39m,
`Auto Score 9` = [32mcol_double()[39m,
`Auto Score 10` = [32mcol_double()[39m,
`Auto Score 11` = [32mcol_double()[39m,
`Auto Score 12` = [32mcol_double()[39m,
`Auto Score 13` = [32mcol_double()[39m,
`Auto Score 14` = [32mcol_double()[39m,
`Auto Score 15` = [32mcol_double()[39m,
`Auto Score 16` = [32mcol_double()[39m,
`Auto Score 17` = [32mcol_double()[39m,
`Auto Score 18` = [32mcol_double()[39m,
`Auto Score 19` = [32mcol_double()[39m
# ... with 34 more columns
)
See spec(...) for full column specifications.
Q <- read_csv("data\\Q.csv")
Parsed with column specification:
cols(
.default = col_double()
)
See spec(...) for full column specifications.
#glimpse(x_pre)
head(x_pre)
NA
x.gather <-x_pre %>% gather(key = "key", value = "value", -File, -SubjectID)
x.gather
x.questions <-
x.gather %>% filter(str_detect(key, "Question"))
x.questions.dist <- x.questions %>% distinct(value) %>% drop_na() %>% mutate(Q_UNIQUE_ID = row_number())
x.questions.dist %>% write_csv("data\\Q_distinct_id.csv")
x.questions.dist
NA
x.answers <-
x.gather %>% filter(!str_detect(key, "Question"))
x.answers
x.questions %>% distinct(key)
x.answers %>% distinct(key)
x.questions.id <- x.questions %>% inner_join(x.questions.dist) %>% mutate(Q_UNIQUE_ID = factor(Q_UNIQUE_ID))
Joining, by = "value"
x.questions.id
#x.questions.id %>% mutate(var = 1) %>% select(-key) %>% spread(key = "Q_UNIQUE_ID", value = "var")
x.questions.id[c(2596, 9789),]
NA
x.questions.id[c(1330, 3101),]
NA
x.questions.id[c(8679, 11543),]
NA
x.questions.id[c(1871, 6917),]
NA
x.questions.id[c(1458, 5003),]
NA
x.questions.id[c(3221, 6926),]
x.questions.id.filterd <- x.questions.id %>% anti_join(x.questions.id %>% group_by(File, SubjectID, value) %>% summarise(cnt = n(), question_number = paste(key, collapse = ",")) %>% filter(cnt > 1) %>% ungroup(), by = "value")
x.questions.id.filterd
NA
We have the correct Questions. Now we need to add marks of answers against the questions.
X.pre <- x.questions.id.filterd %>% mutate(id = str_split(key, " ", simplify = TRUE)[,2]) %>%
inner_join(
x.answers %>% mutate(id = str_split(key, " ", simplify = TRUE)[,3]), by = c("File", "SubjectID", "id")
) %>% mutate(value.y = as.integer(value.y))
write_csv(X.pre, "X_Pre.csv")
X.pre
X<- X.pre %>% select(-key.x, -key.y, -value.x, -id ) %>%
mutate(Q_UNIQUE_ID = as.integer(Q_UNIQUE_ID)) %>%
semi_join(
Q %>% distinct(Q_UNIQUE_ID)
) %>% spread(key = "Q_UNIQUE_ID", value = "value.y")
Joining, by = "Q_UNIQUE_ID"
write_csv(X, "X.csv")
X
Let’s run some test to verify X
X %>% select(-File, -SubjectID) %>% summarise_all(sum, na.rm = TRUE)
NA
X %>% gather(key = "QuestionID", value = "Score", -File, -SubjectID)
NA
X %>% gather(key = "QuestionID", value = "Score", -File, -SubjectID) %>% filter(File == "Exam1Trial1") %>% mutate(Score = fct_explicit_na(as.character(Score))) %>% group_by(SubjectID, Score) %>% tally() %>%
ggplot() +
aes(x=SubjectID, y=n, fill = Score) +
geom_col(position = position_dodge2()) + facet_wrap(Score~., scales = "free")
NA
NA
X %>% gather(key = "QuestionID", value = "Score", -File, -SubjectID) %>% filter(File == "Exam1Trial1") %>% mutate(Score = fct_explicit_na(as.character(Score))) %>% filter(Score != "(Missing)") %>% group_by(SubjectID, Score) %>% tally() %>%
ggplot() +
aes(x=SubjectID, y=n, fill = Score) +
geom_col(position = position_stack())
NA
NA
X %>% gather(key = "QuestionID", value = "Score", -File, -SubjectID) %>% filter(File == "Exam1Trial1") %>% mutate(Score = fct_explicit_na(as.character(Score))) %>% filter(Score != "(Missing)") %>% group_by(QuestionID, Score) %>% tally() %>% filter(n > 4) %>%
ggplot() +
aes(x=QuestionID, y=n, fill = Score) +
geom_col(position = position_stack()) +
stat_mean() + facet_wrap(Score~.) + coord_flip()
NA
NA
X %>% gather(key = "QuestionID", value = "Score", -File, -SubjectID) %>% filter(File == "Exam1Trial1") %>% mutate(Score = fct_explicit_na(as.character(Score))) %>% group_by(SubjectID, Score) %>% tally() %>% filter(n > 4) %>% ungroup() %>%
ggplot() +
aes(x=SubjectID, y=n, fill = Score) +
geom_col(position = position_stack()) +
stat_summary(fun.y = min, geom = "line") +
stat_mean() + facet_grid(Score~., scales = "free")
NA
NA
library(janitor)
X %>% filter(File == "Exam1Trial1") %>% remove_empty(.,which = "cols")
NA
#Quantify Sparsity
X %>% filter(File == "Exam1Trial1") %>% remove_empty(.,which = "cols") %>%
gather(key = "QuestionID", value = "Scores", -File, -SubjectID) %>%
group_by(File, SubjectID) %>%
summarise(total_na = sum(is.na(Scores)), total = n(), total_attempted = total - total_na) %>%
ggplot() + aes(x=SubjectID, y = total_attempted) + geom_col() + geom_hline(aes(yintercept = min(total_attempted))) + geom_hline(aes(yintercept = max(total_attempted)))
NA
NA
#Question attempts for Exam1Trial1
X %>% filter(File == "Exam1Trial1") %>% remove_empty(.,which = "cols") %>%
gather(key = "QuestionID", value = "Scores", -File, -SubjectID) %>%
group_by(File, QuestionID) %>%
summarise(total_na = sum(is.na(Scores)), total = n(), total_attempted = total - total_na) %>%
ggplot() + aes(x=QuestionID, y = total_attempted) + geom_col() +
geom_hline(aes(yintercept = min(total_attempted))) + geom_hline(aes(yintercept = max(total_attempted))) +
coord_flip()
NA
NA
#Question attempts for All Trials
X %>% remove_empty(.,which = "cols") %>%
gather(key = "QuestionID", value = "Scores", -File, -SubjectID) %>%
group_by(File, QuestionID) %>%
summarise(total_na = sum(is.na(Scores)), total = n(), total_attempted = total - total_na) %>%
ggplot() + aes(x=QuestionID, y = total_attempted) + geom_col() +
geom_hline(aes(yintercept = min(total_attempted))) + geom_hline(aes(yintercept = max(total_attempted))) +
facet_grid(.~File, scales = "free")
NA
NA
question_attempted <- X %>% remove_empty(.,which = "cols") %>%
gather(key = "QuestionID", value = "Scores", -File, -SubjectID) %>%
group_by(File, QuestionID) %>%
summarise(total_na = sum(is.na(Scores)), total = n(), total_attempted = total - total_na)
question_attempted <- question_attempted %>% filter(total_attempted >= 8)
question_attempted
#%>% filter(QuestionID == "103")
Filtering out questions with lesser attempts
X_filtered <- X %>% remove_empty(.,which = "cols") %>%
gather(key = "QuestionID", value = "Scores", -File, -SubjectID) %>% semi_join(question_attempted, by = c("File", "QuestionID")) %>%
spread(key = "QuestionID", value = "Scores")
X_filtered
X %>% remove_empty(.,which = "cols") %>% write_csv("data\\X.csv")
X_filtered %>% remove_empty(.,which = "cols") %>% write_csv("data\\X_filtered.csv")
Write CSVs seperate for each trial to avoid having columns for those questions that were not asked in a trial. This will help to show the true picture of sparsity.
fn.clean <- function (df) {
return(df %>% remove_empty(.,which = "cols"))
}
X.individual.list <- X %>%
nest(-File) %>%
mutate(data_clean = map(data, fn.clean))
X.individual.list
[38;5;246m# A tibble: 8 x 3[39m
File data data_clean
[3m[38;5;246m<chr>[39m[23m [3m[38;5;246m<list>[39m[23m [3m[38;5;246m<list>[39m[23m
[38;5;250m1[39m Exam1Trial1 [38;5;246m<tibble [74 x 940]>[39m [38;5;246m<tibble [74 x 287]>[39m
[38;5;250m2[39m Exam1Trial2 [38;5;246m<tibble [57 x 940]>[39m [38;5;246m<tibble [57 x 278]>[39m
[38;5;250m3[39m Exam2Trial1 [38;5;246m<tibble [66 x 940]>[39m [38;5;246m<tibble [66 x 236]>[39m
[38;5;250m4[39m Exam2Trial2 [38;5;246m<tibble [67 x 940]>[39m [38;5;246m<tibble [67 x 237]>[39m
[38;5;250m5[39m Exam3Trial1 [38;5;246m<tibble [47 x 940]>[39m [38;5;246m<tibble [47 x 178]>[39m
[38;5;250m6[39m Exam3Trial2 [38;5;246m<tibble [78 x 940]>[39m [38;5;246m<tibble [78 x 179]>[39m
[38;5;250m7[39m Exam4Trial1 [38;5;246m<tibble [64 x 940]>[39m [38;5;246m<tibble [64 x 239]>[39m
[38;5;250m8[39m Exam4Trial2 [38;5;246m<tibble [72 x 940]>[39m [38;5;246m<tibble [72 x 239]>[39m
fn.write <- function(key, data) {
print(data)
data %>% write_csv(paste0("data\\",key,".csv"))
}
walk2(X.individual.list$File, X.individual.list$data_clean, fn.write)
NA
NA
X %>% filter(File == "Exam1Trial1") %>% remove_empty(.,which = "cols")
NA
X %>% filter(File == "Exam1Trial2") %>% remove_empty(.,which = "cols")
NA
X %>% filter(File == "Exam2Trial1") %>% remove_empty(.,which = "cols") %>%
gather(key="Questions", value = "Answers", -File, -SubjectID) %>%
mutate(Answers = fct_explicit_na(as.character (Answers))) %>%
ggplot() + aes(x = Questions, fill = Answers) + geom_bar(position = position_dodge2()) + facet_wrap(Answers ~. , scales = "free")
X %>% filter(File == "Exam2Trial1") %>% remove_empty(.,which = "cols") %>%
gather(key="Questions", value = "Answers", -File, -SubjectID) %>%
mutate(Answers = fct_explicit_na(as.character (Answers))) %>%
ggplot() + aes(x = Answers) + geom_bar(position = position_dodge2()) + facet_wrap(Answers ~. , scales = "free")
Q <- read_csv("data\\Q.csv")
Parsed with column specification:
cols(
.default = col_double()
)
See spec(...) for full column specifications.
Q
NA
fn.skills <- function (df) {
df <- df %>% remove_empty(.,which = "cols") %>%
gather(key = "Q_UNIQUE_ID", value = "Score", -SubjectID) %>%
mutate(Q_UNIQUE_ID = as.integer(Q_UNIQUE_ID)) %>% distinct(Q_UNIQUE_ID) %>%
inner_join(
Q
) %>% remove_empty(.,which = "cols")
return(df)
}
X.Q <- X.individual.list %>%
mutate(data_Q_skills = map(data_clean, fn.skills))
Joining, by = "Q_UNIQUE_ID"
Joining, by = "Q_UNIQUE_ID"
Joining, by = "Q_UNIQUE_ID"
Joining, by = "Q_UNIQUE_ID"
Joining, by = "Q_UNIQUE_ID"
Joining, by = "Q_UNIQUE_ID"
Joining, by = "Q_UNIQUE_ID"
Joining, by = "Q_UNIQUE_ID"
X.Q
[38;5;246m# A tibble: 8 x 4[39m
File data data_clean data_Q_skills
[3m[38;5;246m<chr>[39m[23m [3m[38;5;246m<list>[39m[23m [3m[38;5;246m<list>[39m[23m [3m[38;5;246m<list>[39m[23m
[38;5;250m1[39m Exam1Trial1 [38;5;246m<tibble [74 x 940]>[39m [38;5;246m<tibble [74 x 287]>[39m [38;5;246m<tibble [286 x 25]>[39m
[38;5;250m2[39m Exam1Trial2 [38;5;246m<tibble [57 x 940]>[39m [38;5;246m<tibble [57 x 278]>[39m [38;5;246m<tibble [277 x 25]>[39m
[38;5;250m3[39m Exam2Trial1 [38;5;246m<tibble [66 x 940]>[39m [38;5;246m<tibble [66 x 236]>[39m [38;5;246m<tibble [235 x 17]>[39m
[38;5;250m4[39m Exam2Trial2 [38;5;246m<tibble [67 x 940]>[39m [38;5;246m<tibble [67 x 237]>[39m [38;5;246m<tibble [236 x 17]>[39m
[38;5;250m5[39m Exam3Trial1 [38;5;246m<tibble [47 x 940]>[39m [38;5;246m<tibble [47 x 178]>[39m [38;5;246m<tibble [177 x 14]>[39m
[38;5;250m6[39m Exam3Trial2 [38;5;246m<tibble [78 x 940]>[39m [38;5;246m<tibble [78 x 179]>[39m [38;5;246m<tibble [178 x 14]>[39m
[38;5;250m7[39m Exam4Trial1 [38;5;246m<tibble [64 x 940]>[39m [38;5;246m<tibble [64 x 239]>[39m [38;5;246m<tibble [238 x 14]>[39m
[38;5;250m8[39m Exam4Trial2 [38;5;246m<tibble [72 x 940]>[39m [38;5;246m<tibble [72 x 239]>[39m [38;5;246m<tibble [238 x 14]>[39m
#X %>% filter(File == "Exam1Trial1") %>% remove_empty(.,which = "cols") %>%
# gather(key = "Q_UNIQUE_ID", value = "Score", -File, -SubjectID) %>%
# mutate(Q_UNIQUE_ID = as.integer(Q_UNIQUE_ID)) %>% distinct(Q_UNIQUE_ID) %>%
#
# inner_join(
# Q
#
# ) %>% remove_empty(.,which = "cols")